Rendering HTML¶

In [ ]:
import plotly
plotly.offline.init_notebook_mode()

Framing the Problem¶

Problem is the risk analysis of patients with diabetes.

Import libraries & Load Dataset¶

In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn import datasets

# Load the diabetes dataset
data_diabetes = datasets.load_diabetes()
data_diabetes
Out[ ]:
{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
        128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
        150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
        200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
         42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
         83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
        104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
        173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
        107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
         60., 174., 259., 178., 128.,  96., 126., 288.,  88., 292.,  71.,
        197., 186.,  25.,  84.,  96., 195.,  53., 217., 172., 131., 214.,
         59.,  70., 220., 268., 152.,  47.,  74., 295., 101., 151., 127.,
        237., 225.,  81., 151., 107.,  64., 138., 185., 265., 101., 137.,
        143., 141.,  79., 292., 178.,  91., 116.,  86., 122.,  72., 129.,
        142.,  90., 158.,  39., 196., 222., 277.,  99., 196., 202., 155.,
         77., 191.,  70.,  73.,  49.,  65., 263., 248., 296., 214., 185.,
         78.,  93., 252., 150.,  77., 208.,  77., 108., 160.,  53., 220.,
        154., 259.,  90., 246., 124.,  67.,  72., 257., 262., 275., 177.,
         71.,  47., 187., 125.,  78.,  51., 258., 215., 303., 243.,  91.,
        150., 310., 153., 346.,  63.,  89.,  50.,  39., 103., 308., 116.,
        145.,  74.,  45., 115., 264.,  87., 202., 127., 182., 241.,  66.,
         94., 283.,  64., 102., 200., 265.,  94., 230., 181., 156., 233.,
         60., 219.,  80.,  68., 332., 248.,  84., 200.,  55.,  85.,  89.,
         31., 129.,  83., 275.,  65., 198., 236., 253., 124.,  44., 172.,
        114., 142., 109., 180., 144., 163., 147.,  97., 220., 190., 109.,
        191., 122., 230., 242., 248., 249., 192., 131., 237.,  78., 135.,
        244., 199., 270., 164.,  72.,  96., 306.,  91., 214.,  95., 216.,
        263., 178., 113., 200., 139., 139.,  88., 148.,  88., 243.,  71.,
         77., 109., 272.,  60.,  54., 221.,  90., 311., 281., 182., 321.,
         58., 262., 206., 233., 242., 123., 167.,  63., 197.,  71., 168.,
        140., 217., 121., 235., 245.,  40.,  52., 104., 132.,  88.,  69.,
        219.,  72., 201., 110.,  51., 277.,  63., 118.,  69., 273., 258.,
         43., 198., 242., 232., 175.,  93., 168., 275., 293., 281.,  72.,
        140., 189., 181., 209., 136., 261., 113., 131., 174., 257.,  55.,
         84.,  42., 146., 212., 233.,  91., 111., 152., 120.,  67., 310.,
         94., 183.,  66., 173.,  72.,  49.,  64.,  48., 178., 104., 132.,
        220.,  57.]),
 'frame': None,
 'DESCR': '.. _diabetes_dataset:\n\nDiabetes dataset\n----------------\n\nTen baseline variables, age, sex, body mass index, average blood\npressure, and six blood serum measurements were obtained for each of n =\n442 diabetes patients, as well as the response of interest, a\nquantitative measure of disease progression one year after baseline.\n\n**Data Set Characteristics:**\n\n  :Number of Instances: 442\n\n  :Number of Attributes: First 10 columns are numeric predictive values\n\n  :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n\n  :Attribute Information:\n      - age     age in years\n      - sex\n      - bmi     body mass index\n      - bp      average blood pressure\n      - s1      tc, total serum cholesterol\n      - s2      ldl, low-density lipoproteins\n      - s3      hdl, high-density lipoproteins\n      - s4      tch, total cholesterol / HDL\n      - s5      ltg, possibly log of serum triglycerides level\n      - s6      glu, blood sugar level\n\nNote: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).\n\nSource URL:\nhttps://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n\nFor more information see:\nBradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499.\n(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n',
 'feature_names': ['age',
  'sex',
  'bmi',
  'bp',
  's1',
  's2',
  's3',
  's4',
  's5',
  's6'],
 'data_filename': 'diabetes_data_raw.csv.gz',
 'target_filename': 'diabetes_target.csv.gz',
 'data_module': 'sklearn.datasets.data'}

Converting to dataframe¶

In [ ]:
df_diabetes = pd.DataFrame(data_diabetes.data,columns=data_diabetes.feature_names)
df_diabetes['target'] = data_diabetes.target
df_diabetes.head()
Out[ ]:
age sex bmi bp s1 s2 s3 s4 s5 s6 target
0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 -0.002592 0.019907 -0.017646 151.0
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 -0.039493 -0.068332 -0.092204 75.0
2 0.085299 0.050680 0.044451 -0.005670 -0.045599 -0.034194 -0.032356 -0.002592 0.002861 -0.025930 141.0
3 -0.089063 -0.044642 -0.011595 -0.036656 0.012191 0.024991 -0.036038 0.034309 0.022688 -0.009362 206.0
4 0.005383 -0.044642 -0.036385 0.021872 0.003935 0.015596 0.008142 -0.002592 -0.031988 -0.046641 135.0

Exploratory Data Analysis¶

Descibe the data¶

In [ ]:
df_describe = df_diabetes.describe()
df_describe
Out[ ]:
age sex bmi bp s1 s2 s3 s4 s5 s6 target
count 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 442.000000
mean -2.511817e-19 1.230790e-17 -2.245564e-16 -4.797570e-17 -1.381499e-17 3.918434e-17 -5.777179e-18 -9.042540e-18 9.293722e-17 1.130318e-17 152.133484
std 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 77.093005
min -1.072256e-01 -4.464164e-02 -9.027530e-02 -1.123988e-01 -1.267807e-01 -1.156131e-01 -1.023071e-01 -7.639450e-02 -1.260971e-01 -1.377672e-01 25.000000
25% -3.729927e-02 -4.464164e-02 -3.422907e-02 -3.665608e-02 -3.424784e-02 -3.035840e-02 -3.511716e-02 -3.949338e-02 -3.324559e-02 -3.317903e-02 87.000000
50% 5.383060e-03 -4.464164e-02 -7.283766e-03 -5.670422e-03 -4.320866e-03 -3.819065e-03 -6.584468e-03 -2.592262e-03 -1.947171e-03 -1.077698e-03 140.500000
75% 3.807591e-02 5.068012e-02 3.124802e-02 3.564379e-02 2.835801e-02 2.984439e-02 2.931150e-02 3.430886e-02 3.243232e-02 2.791705e-02 211.500000
max 1.107267e-01 5.068012e-02 1.705552e-01 1.320436e-01 1.539137e-01 1.987880e-01 1.811791e-01 1.852344e-01 1.335973e-01 1.356118e-01 346.000000
  • Dataset contains 442 data.
  • For all the features mean close to 0 and standard deviation close to 1. But for the target variable mean is 152 & standard deviation is 77 which indicates variability.

Plot graphs for each feature and target to find the insights¶

In [ ]:
df_diabetes.hist(figsize=(12,10))
plt.show()
  • The histograms indicates that age,bmi,bp,s1,s2,s3,s4,s5,s6 are centered around mean.
  • But the target variable is rightly skewed, that means higher number of patients with lower risk of diabetes progression and there are only less number of patients with high risk of diabetes progression.

Correlation Matrix¶

In [ ]:
df_diabetes_corr = df_diabetes.corr()
plt.figure(figsize=(12,10))
sns.heatmap(df_diabetes_corr, annot=True)
plt.title('Correlation Matrix of Diabetes Dataset')
plt.show()
  • s3 shows very less correlation with other features and taget.
  • All the features except s3 shows a positive correlation with target variable.
  • BMI shows highest correlation with target among the features. That means change in BMI affects risk of diabetes progression high when comapred to other features.
  • s5 also has good enough correlation with target value.
  • So, BMI & s5 has more importance in predicting the risk of diabetes progression.

Cleaning the data¶

  • The given dataset is need not to be cleaned. Because it is mean centered to 0 and scaled by standard deviation of 1.

Split the dataset¶

In [ ]:
from sklearn.model_selection import train_test_split

# We are taking only BMI & Taget columns because it has already stated that 
# BMI is the independent variable and tage is the dependent variable.

X = df_diabetes[['bmi']]
y= df_diabetes['target']

X_train, X_old, y_train, y_old = train_test_split(X, y, test_size=0.3)

X_val,X_test,y_val,y_test = train_test_split(X_old, y_old, test_size=0.5)

print(X_train.shape, X_val.shape, X_test.shape)
(309, 1) (66, 1) (67, 1)

Polynomial Regression on BMI v/s diesease progression¶

In [ ]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

def create_poly_model(X,y,degrees):
    models = {}
    for degree in degrees:
        model = Pipeline([('polynomial', PolynomialFeatures(degree=degree)),
                          ('linear', LinearRegression())])
        model.fit(X, y)
        models[degree] = model
    return models
    

degrees = list(range(0, 6))
models = create_poly_model(X_train, y_train, degrees)

# print models
for degree, model in models.items():
    print(f'Degree: {degree}')
    print(f'Model: {model}\n')
Degree: 0
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=0)),
                ('linear', LinearRegression())])

Degree: 1
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=1)),
                ('linear', LinearRegression())])

Degree: 2
Model: Pipeline(steps=[('polynomial', PolynomialFeatures()),
                ('linear', LinearRegression())])

Degree: 3
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=3)),
                ('linear', LinearRegression())])

Degree: 4
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=4)),
                ('linear', LinearRegression())])

Degree: 5
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=5)),
                ('linear', LinearRegression())])

Comapring Models¶

Report : R-Squared,MAPE,MSE for all models¶

In [ ]:
from sklearn.metrics import r2_score, mean_absolute_error

# MAPE function
def mape(y_act, y_pred):
    return np.mean(np.abs((y_act - y_pred) / y_act)) * 100

for degree, model in models.items():
    # Predictions for train and validation set
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    
    print(f'Degree: {degree}')
    # Train Data
    print(f'    Train R2: {r2_score(y_train, y_train_pred)}')
    print(f'    Train MAE: {mean_absolute_error(y_train, y_train_pred)}')
    print(f'    Train MAPE: {mape(y_train, y_train_pred)}\n')
   
    # Validation Data
    print(f'    Val R2: {r2_score(y_val, y_val_pred)}')
    print(f'    Val MAE: {mean_absolute_error(y_val, y_val_pred)}')
    print(f'    Val MAPE: {mape(y_val, y_val_pred)}\n')
Degree: 0
    Train R2: 0.0
    Train MAE: 68.04991569003258
    Train MAPE: 66.01799967036573

    Val R2: -0.07823669866768101
    Val MAE: 63.83132293811905
    Val MAPE: 65.51186103269214

Degree: 1
    Train R2: 0.347313755584002
    Train MAE: 53.59447779817888
    Train MAPE: 50.08844301550417

    Val R2: 0.21091847464819513
    Val MAE: 50.22058633449763
    Val MAPE: 51.338329603688415

Degree: 2
    Train R2: 0.3473472724512475
    Train MAE: 53.58839529195861
    Train MAPE: 50.096427869352745

    Val R2: 0.20985365982989612
    Val MAE: 50.24978480638808
    Val MAPE: 51.302737130412304

Degree: 3
    Train R2: 0.3477847677130369
    Train MAE: 53.578024493918136
    Train MAPE: 50.16170777420983

    Val R2: 0.1937285394232039
    Val MAE: 50.84577282430887
    Val MAPE: 51.726016108969205

Degree: 4
    Train R2: 0.35429558760846136
    Train MAE: 53.425452957113706
    Train MAPE: 49.7129401178764

    Val R2: 0.0936949287419161
    Val MAE: 51.950894136546786
    Val MAPE: 51.50799405837877

Degree: 5
    Train R2: 0.3576349367439684
    Train MAE: 53.138307485968724
    Train MAPE: 49.57105937283405

    Val R2: 0.1683189237120759
    Val MAE: 50.74025554300594
    Val MAPE: 50.49048570138156

  • Model with degree 1 is the best model.
  • It has better R-sqaured value for validation data than other models an have a comapareable value for training. So, it helps to predict the unseen data.

Conclusion¶

Run the Chosen model with test data¶

In [ ]:
from sklearn.metrics import mean_squared_error

models[1].fit(X_test,y_test)  # 5 is the degree of the polynomial model in the models dictionary
y_test_pred = models[1].predict(X_test)

# Evaluating the model with test data
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

print('Test Data')
print(f'R2 score: {r2_test}')
print(f'MSE: {mse_test}')
print(f'MAE: {mae_test}')
Test Data
R2 score: 0.38141794551899577
MSE: 3103.6980457882028
MAE: 44.78346230138723
In [ ]:
# Plotting the model
plt.figure(figsize=(10,8))
plt.scatter(X_train, y_train, color='blue', label='Train Data')
plt.scatter(X_val, y_val, color='red', label='Validation Data')
plt.scatter(X_test, y_test, color='green', label='Test Data')

plt.plot(X_test, y_test_pred, color='black', label='Degree 1 test data')
plt.plot(X_train, models[1].predict(X_train), color='yellow', label='Degree 1 train data')
plt.plot(X_val, models[1].predict(X_val), color='orange', label='Degree 1 val data')

plt.title('Polynomial Regression with model degree 1')
plt.xlabel('BMI')
plt.ylabel('Target')
plt.legend()
plt.show()

Equation of the predicted model¶

In [ ]:
def print_pipeline_model_stats(model):
    # print model    
    print(f'Model: {model}')
    print(f'Coefficients: {model[-1].coef_}')
    print(f'Intercept: {model[-1].intercept_}')
    # generate equation string:
    equation = 'y = '
    for i, coef in enumerate(model[-1].coef_):
        equation += f'{coef:.2f} * x^{i} + '
    equation += f'{model[-1].intercept_:.2f}'
    print(f'Equation: {equation}')

print_pipeline_model_stats(models[1])
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=1)),
                ('linear', LinearRegression())])
Coefficients: [   0.         1015.39430542]
Intercept: 148.53836472695116
Equation: y = 0.00 * x^0 + 1015.39 * x^1 + 148.54

Predict manually with bmi value = 0.05¶

In [ ]:
bmi_manual = 0.05
y_pred_manual = 1015.39 * (bmi_manual)**1 + 148.54
print(y_pred_manual)

y_model = models[1].predict([[bmi_manual]])
print(y_model)
199.30949999999999
[199.30808]
d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\sklearn\base.py:439: UserWarning:

X does not have valid feature names, but PolynomialFeatures was fitted with feature names

  • After predicting and manual calculation we got same value for both.

Trainable Parameters for 6 models¶

In [ ]:
trainable_params = {}
for degree in range(6):
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X_train)
    params_count = X_poly.shape[1] 
    trainable_params[degree] = params_count
    print(f'Degree {degree}: {poly.get_feature_names_out()}')

degrees = list(trainable_params.keys())
params = list(trainable_params.values())
print('Degrees:', degrees)
print('Trainable Parameters:', params)
Degree 0: ['1']
Degree 1: ['1' 'bmi']
Degree 2: ['1' 'bmi' 'bmi^2']
Degree 3: ['1' 'bmi' 'bmi^2' 'bmi^3']
Degree 4: ['1' 'bmi' 'bmi^2' 'bmi^3' 'bmi^4']
Degree 5: ['1' 'bmi' 'bmi^2' 'bmi^3' 'bmi^4' 'bmi^5']
Degrees: [0, 1, 2, 3, 4, 5]
Trainable Parameters: [1, 2, 3, 4, 5, 6]